Data Source: https://www.kaggle.com/praveengovi/emotions-dataset-for-nlp
import pandas as pd
# visualization
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.graph_objs import *
# to avoid warnings
import warnings
warnings.filterwarnings("ignore")
# text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import word2vec
# Keras imports
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout, LSTM
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Scikit Learn imports
from sklearn.model_selection import train_test_split
# English stop-word set, built once for reuse in later preprocessing.
set_stop_words = set(stopwords.words("english"))
# Training split: semicolon-separated "text;emotion" rows with no header line.
train_data = pd.read_csv("Data/train.txt", delimiter=";", names=["text", "emotion"])
train_data.head()
| text | emotion | |
|---|---|---|
| 0 | i didnt feel humiliated | sadness |
| 1 | i can go from feeling so hopeless to so damned... | sadness |
| 2 | im grabbing a minute to post i feel greedy wrong | anger |
| 3 | i am ever feeling nostalgic about the fireplac... | love |
| 4 | i am feeling grouchy | anger |
# Test split, parsed with the same "text;emotion" layout as the training file.
test_data = pd.read_csv("Data/test.txt", delimiter=";", names=["text", "emotion"])
test_data.head()
| text | emotion | |
|---|---|---|
| 0 | im feeling rather rotten so im not very ambiti... | sadness |
| 1 | im updating my blog because i feel shitty | sadness |
| 2 | i never make her separate from me because i do... | sadness |
| 3 | i left with my bouquet of red and yellow tulip... | joy |
| 4 | i was feeling a little vain when i did this one | sadness |
# Validation split.
# Fix: this previously loaded 'Data/test.txt', which made the validation set
# an exact copy of the test set (the Kaggle dataset ships a separate val.txt).
val_data = pd.read_csv('Data/val.txt', sep=';', names=['text', 'emotion'])
val_data.head()
| text | emotion | |
|---|---|---|
| 0 | im feeling rather rotten so im not very ambiti... | sadness |
| 1 | im updating my blog because i feel shitty | sadness |
| 2 | i never make her separate from me because i do... | sadness |
| 3 | i left with my bouquet of red and yellow tulip... | joy |
| 4 | i was feeling a little vain when i did this one | sadness |
The test and validation splits look identical here because `val_data` was read from `Data/test.txt` rather than `Data/val.txt` — the read path above needs fixing.
len(train_data), len(test_data), len(val_data)
(16000, 2000, 2000)
# Sanity check for missing values in every split.
# Fix: val_data was previously excluded from this check.
if not (train_data.isna().sum().any()
        or test_data.isna().sum().any()
        or val_data.isna().sum().any()):
    print("No Null Values detected")
else:
    print("Null Values detected")
No Null Values detected
# Emotion tags in first-appearance order within the training data.
emotions_order = train_data["emotion"].unique().tolist()
emotions_order
['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
# Colour assigned to each emotion tag, paired positionally with emotions_order.
emotions_color = ['blue', 'red', 'pink', 'green', 'grey', 'yellow']
# Idiom fix: build the mapping directly with dict(zip(...)) instead of a
# manual accumulation loop.
dict_emo_color = dict(zip(emotions_order, emotions_color))
dict_emo_color
{'sadness': 'blue',
'anger': 'red',
'love': 'pink',
'surprise': 'green',
'fear': 'grey',
'joy': 'yellow'}
# Class-balance check: per-emotion row counts in the training split.
fig = px.histogram(
    train_data,
    x="emotion",
    color="emotion",
    template='plotly_white',
    category_orders={'emotion': emotions_order},
)
fig.update_layout(
    title={'text': "Row Count of <b>Train Data</b> by Emotion Tags",
           'x': 0.5,
           'xanchor': 'center'})
fig.update_xaxes(title='Emotions')
fig.update_yaxes(title='Number of Rows')
fig.show()
# Same class-balance check for the test split.
fig = px.histogram(
    test_data,
    x="emotion",
    color="emotion",
    template='plotly_white',
    category_orders={'emotion': emotions_order},
)
fig.update_layout(
    title={'text': "Row Count of <b>Test Data</b> by Emotion Tags",
           'x': 0.5,
           'xanchor': 'center'})
fig.update_xaxes(title='Emotions')
fig.update_yaxes(title='Number of Rows')
fig.show()
# Encode the emotion strings as a pandas categorical; cat.codes yields the
# integer label per row (codes follow the sorted category order, not
# emotions_order).
train_data["emotion"] = pd.Categorical(train_data["emotion"])
train_data["emotion_label"] = train_data["emotion"].cat.codes
train_data.head()
| text | emotion | emotion_label | |
|---|---|---|---|
| 0 | i didnt feel humiliated | sadness | 4 |
| 1 | i can go from feeling so hopeless to so damned... | sadness | 4 |
| 2 | im grabbing a minute to post i feel greedy wrong | anger | 0 |
| 3 | i am ever feeling nostalgic about the fireplac... | love | 3 |
| 4 | i am feeling grouchy | anger | 0 |
# Same categorical encoding for the test split.
test_data["emotion"] = pd.Categorical(test_data["emotion"])
test_data["emotion_label"] = test_data["emotion"].cat.codes
test_data.head()
| text | emotion | emotion_label | |
|---|---|---|---|
| 0 | im feeling rather rotten so im not very ambiti... | sadness | 4 |
| 1 | im updating my blog because i feel shitty | sadness | 4 |
| 2 | i never make her separate from me because i do... | sadness | 4 |
| 3 | i left with my bouquet of red and yellow tulip... | joy | 2 |
| 4 | i was feeling a little vain when i did this one | sadness | 4 |
# Split each DataFrame into raw text features and one-hot label tensors
# (6 emotion classes).
train_features = train_data['text']
train_labels = tf.one_hot(train_data["emotion_label"], depth=6)
test_features = test_data['text']
test_labels = tf.one_hot(test_data["emotion_label"], depth=6)
train_features[:5]
0 i didnt feel humiliated 1 i can go from feeling so hopeless to so damned... 2 im grabbing a minute to post i feel greedy wrong 3 i am ever feeling nostalgic about the fireplac... 4 i am feeling grouchy Name: text, dtype: object
train_labels[:5]
<tf.Tensor: shape=(5, 6), dtype=float32, numpy=
array([[0., 0., 0., 0., 1., 0.],
[0., 0., 0., 0., 1., 0.],
[1., 0., 0., 0., 0., 0.],
[0., 0., 0., 1., 0., 0.],
[1., 0., 0., 0., 0., 0.]], dtype=float32)>
# Tokenize every sentence into a word list (NLTK word_tokenize).
tokenized_train_features = list(map(word_tokenize, train_features))
tokenized_test_features = list(map(word_tokenize, test_features))
# Sentence lengths (in tokens) for the training split, used for the plot below.
list_len_text_by_words = [len(tokens) for tokens in tokenized_train_features]
# Distribution of sentence lengths across the training split.
fig = px.histogram(x=list_len_text_by_words, template='plotly_white')
fig.update_layout(
    title={'text': "Histogram of <b>Sentence Length</b> by Number of Words",
           'x': 0.4,
           'xanchor': 'center'},
    showlegend=False,
    hovermode='x',
)
fig.update_xaxes(title='Length of Sentences by Words')
fig.update_yaxes(title='Frequency')
fig.show()
# Train a skip-gram word2vec model over the combined train+test token lists.
vector_size = 300
corpus = tokenized_train_features + tokenized_test_features
w2v_model = word2vec.Word2Vec(
    corpus,
    vector_size=vector_size,  # dimensionality of the word vectors
    window=15,                # context window size
    min_count=1,              # keep every token, even single occurrences
    sg=1,                     # skip-gram (0 would be CBOW)
)
w2v_model
<gensim.models.word2vec.Word2Vec at 0x243dfa76640>
# Size of the learned word2vec vocabulary.
len(w2v_model.wv)
16182
# Sanity check: nearest neighbours of 'sad' in the embedding space.
w2v_model.wv.most_similar('sad', topn=10)
[('lonely', 0.9364967942237854),
('confused', 0.9065178632736206),
('unhappy', 0.9057068824768066),
('angry', 0.9040181040763855),
('horribly', 0.8974623084068298),
('shit', 0.893015444278717),
('coz', 0.8929088711738586),
('upset', 0.8924469351768494),
('bitchy', 0.8902505040168762),
('typing', 0.8889704346656799)]
w2v_model.wv.most_similar('happy', topn=10)
[('contented', 0.8968585729598999),
('whats', 0.8929765224456787),
('space', 0.876916229724884),
('birthday', 0.8740570545196533),
('excited', 0.8713725805282593),
('report', 0.8704681396484375),
('emo', 0.8684948682785034),
('active', 0.8665992021560669),
('haha', 0.8660399317741394),
('apart', 0.865911602973938)]
w2v_model.wv.most_similar(['sad', 'feel'], topn=10)
[('shit', 0.9060815572738647),
('damn', 0.8845294117927551),
('horribly', 0.8832857012748718),
('upset', 0.8816204071044922),
('someones', 0.876876711845398),
('worthless', 0.8762528896331787),
('bored', 0.871407687664032),
('overly', 0.8712095618247986),
('worried', 0.871063232421875),
('unwanted', 0.8698233962059021)]
w2v_model.wv.most_similar('sad', negative='lonely', topn=10)
[('us', 0.2892538011074066),
('feelings', 0.2728334665298462),
('him', 0.25731393694877625),
('u', 0.2539515197277069),
('women', 0.24864810705184937),
('wrong', 0.24588553607463837),
('they', 0.2457340508699417),
('hurt', 0.23416301608085632),
('talk', 0.22973524034023285),
('them', 0.21976956725120544)]
# Configuration for the Keras Embedding layer.
vocab_size = len(w2v_model.wv)  # number of learned word vectors
max_seq_len = 20  # sentences will be padded/truncated to 20 tokens
# Embedding matrix: one row per vocabulary word, in key_to_index order,
# so row i is the vector for the word with word2vec index i.
vocab = w2v_model.wv.key_to_index.keys()
embedding_matrix = w2v_model.wv[vocab]
# Use PCA or T-SNE for reducing the dimension to two
from sklearn.decomposition import PCA
def plot_similarity_PCA(model, word_vector):
    """Project word vectors to 2-D with PCA and scatter-plot them with labels.

    Fix: the function previously ignored its `model` parameter and read the
    module-level `vocab` inside the annotation loop; it now derives the word
    labels from `model` itself. Assumes `word_vector`'s rows are index-aligned
    with the model's vocabulary (i.e. word_vector == model.wv[vocab]).
    """
    pca = PCA(n_components=2)
    result = pca.fit_transform(word_vector)
    print(result.shape)
    plt.scatter(
        result[:, 0],  # first principal component
        result[:, 1],  # second principal component
        color='b'
    )
    # Annotate each point with its word, taken from the model's vocabulary.
    for i, word in enumerate(model.wv.key_to_index):
        plt.annotate(word, xy=(result[i, 0], result[i, 1]))
    plt.show()
plt.figure(figsize=[10, 10])
plot_similarity_PCA(w2v_model, w2v_model.wv[vocab])
(16182, 2)
or use Embedding Projector by Google - http://projector.tensorflow.org/
Simply upload your model and visualize the learned embeddings. Much better than this diagram.
w2v_model.wv.get_vector('happy')
array([ 0.0758569 , 0.02622177, -0.06680174, -0.09259956, -0.03067747,
-0.19519806, 0.01492052, 0.24110462, -0.12859498, -0.05598032,
0.154052 , -0.12078906, -0.14642066, 0.15856652, 0.2122258 ,
-0.05670339, 0.18150334, 0.08994725, 0.03474677, 0.05769769,
0.09546447, 0.07675274, 0.03324675, 0.12136482, -0.10291205,
0.07716864, -0.33965936, 0.14395711, -0.11283117, -0.19143796,
-0.10241225, -0.04499802, 0.11366425, 0.16575855, -0.14706653,
0.20255452, 0.03810835, -0.04168792, -0.05104099, 0.01381163,
0.04274488, -0.13745378, -0.01624873, -0.14674994, 0.13411993,
0.08848565, 0.04731587, 0.03462836, 0.25293407, 0.00342069,
-0.07936186, -0.11639327, 0.09869944, 0.04414143, 0.00494947,
-0.01318336, 0.03934562, 0.04339243, 0.0354338 , -0.04482697,
-0.02409558, -0.07898812, 0.03044529, 0.16829969, 0.04442432,
-0.01109935, -0.00917565, 0.27678362, -0.09174865, 0.11277412,
0.09857363, 0.06819192, 0.2245417 , -0.07089665, 0.07044945,
-0.00316063, -0.19881086, 0.06576326, 0.0607991 , 0.09008579,
-0.18144414, -0.12937196, -0.04201027, 0.26608023, 0.00259859,
0.05480948, 0.14676768, -0.08020865, 0.03065497, 0.08345652,
0.12882316, -0.12527655, -0.07038872, 0.10973712, 0.216197 ,
0.16930123, 0.06663705, -0.09128762, 0.03779356, 0.13744132,
-0.11629711, -0.0087396 , 0.06297549, 0.01467522, 0.04747882,
-0.16908409, -0.02013602, -0.04793166, -0.2733571 , -0.00807715,
-0.23633212, 0.12486058, -0.06892245, -0.0017713 , -0.07570446,
0.17987128, -0.1774472 , 0.03622684, 0.08208895, -0.30130783,
0.02077806, -0.00362255, 0.2617209 , -0.08096551, -0.16767724,
0.23115362, 0.00294005, -0.14465623, -0.0596449 , 0.21916759,
0.08493185, 0.03311376, 0.3848083 , -0.33568135, -0.00933718,
0.30933228, -0.04894128, -0.13998678, -0.2640263 , -0.2634512 ,
0.18363823, -0.12174641, -0.21765094, 0.22417551, -0.06779974,
-0.3049131 , -0.15550137, -0.02165652, 0.05717926, -0.06985344,
-0.00355616, -0.24678369, -0.01335377, -0.27346113, 0.07881541,
0.2442983 , -0.22542892, 0.09441712, 0.05744012, 0.20362607,
-0.08987432, 0.02209077, 0.01543627, 0.17083387, -0.10554256,
0.10917955, -0.05096637, -0.00049419, -0.15260604, 0.3139253 ,
-0.2817775 , 0.1826884 , -0.0246223 , 0.22063306, 0.06734051,
0.08868687, -0.01455367, -0.07061606, 0.14099091, 0.06138005,
-0.13016646, 0.1328046 , -0.2982794 , -0.03442439, -0.06715344,
-0.04644765, 0.36789984, 0.17014179, 0.01981223, -0.04890771,
0.23732093, 0.17068005, -0.12702374, 0.08883484, -0.06205164,
0.04342844, -0.10611929, -0.19379136, 0.00834022, -0.07192267,
-0.11468259, -0.02564187, 0.20929664, -0.08926708, 0.29074863,
-0.27228382, 0.1810195 , 0.17144552, -0.15271804, 0.08513678,
-0.07190081, -0.13992862, 0.00635077, -0.31191415, 0.12140489,
-0.11126767, -0.13563153, -0.394757 , -0.23049864, -0.17862985,
0.1107306 , 0.16715994, -0.08617249, -0.18669863, -0.1932955 ,
-0.19067393, 0.00843216, -0.21824557, -0.18127272, 0.04344313,
0.17914732, -0.04646606, -0.00465821, 0.13895111, -0.15802707,
0.10226937, -0.19731362, -0.09088495, -0.0139476 , -0.35211712,
0.12044787, 0.08315569, -0.0232665 , 0.1422515 , 0.12758158,
-0.16480729, -0.09891806, -0.07297929, 0.03502282, 0.20154566,
0.17573069, -0.07351367, 0.1905679 , -0.0413966 , -0.26891655,
-0.02910962, 0.34913516, 0.09394397, -0.3431017 , -0.09072361,
-0.0102805 , -0.00575538, 0.23253106, -0.30874574, -0.39783692,
0.01984232, 0.14251772, 0.27109843, -0.17652844, 0.09593348,
-0.06872626, -0.03223694, 0.21381785, 0.25692865, -0.04412299,
0.35161087, 0.00885422, 0.15742251, -0.32997215, 0.02528062,
0.02557394, 0.01647614, 0.08075942, 0.00695719, 0.3249548 ,
0.15432318, -0.15325683, -0.05958552, 0.05504843, 0.13607205,
0.13497446, 0.09433284, -0.01920925, 0.02939652, 0.17109282,
0.40231198, -0.11260425, -0.33375183, 0.12056679, -0.15598153],
dtype=float32)
w2v_model.wv.get_vector('happy').shape
(300,)
Gensim API - https://radimrehurek.com/gensim/models/word2vec.html
# Final check on the training parameters: vocabulary size, embedding
# dimensionality, and the padded sequence length.
vocab_size, vector_size, max_seq_len
(16182, 300, 20)
def w2v_indexed_token_sequences(w2v_model, list_features):
    """Map each tokenized sentence to a list of word2vec vocabulary indices.

    Tokens absent from the model's vocabulary are silently dropped, so an
    output sequence may be shorter than its input (possibly empty).

    Idiom fix: the per-token try/except KeyError is replaced with a membership
    test inside a comprehension — same behavior, no exception-driven control
    flow in the inner loop.
    """
    key_to_index = w2v_model.wv.key_to_index  # hoisted attribute lookup
    return [
        [key_to_index[token] for token in seq if token in key_to_index]
        for seq in list_features
    ]
# Convert both splits' token lists into word2vec index sequences.
indexed_train_features = w2v_indexed_token_sequences(w2v_model, tokenized_train_features)
indexed_test_features = w2v_indexed_token_sequences(w2v_model, tokenized_test_features)
# Should match the original row counts (16000 train, 2000 test).
len(indexed_train_features), len(indexed_test_features)
(16000, 2000)
# Pad (or truncate) every index sequence to exactly max_seq_len, adding
# zeros at the end ('post') of short sequences.
padded_train = pad_sequences(indexed_train_features, maxlen=max_seq_len,
                             padding='post', truncating='post')
padded_test = pad_sequences(indexed_test_features, maxlen=max_seq_len,
                            padding='post', truncating='post')
# Sanity check: after padding every sequence should have length max_seq_len,
# so the histogram collapses to a single bar.
list_len_text_by_words = [len(row) for row in padded_train]
fig = px.histogram(x=list_len_text_by_words, template='plotly_white')
fig.update_layout(
    title={'text': "Histogram of <b>Sentence Length</b> by Number of Words",
           'x': 0.4,
           'xanchor': 'center'},
    showlegend=False,
    hovermode='x',
)
fig.update_xaxes(title='Length of Sentences by Words')
fig.update_yaxes(title='Frequency')
fig.show()
def get_model():
    """Build the emotion classifier: word2vec-initialised embeddings,
    two stacked LSTMs, and a 6-way softmax output.

    The embedding weights come from the trained word2vec model and remain
    trainable (no trainable=False is set).
    """
    return Sequential([
        Embedding(input_dim=vocab_size,
                  output_dim=vector_size,
                  weights=[embedding_matrix],
                  input_length=max_seq_len),
        Dropout(0.6),
        LSTM(max_seq_len, return_sequences=True),
        LSTM(6),
        Dense(6, activation='softmax'),
    ])
# Instantiate the model and inspect layer shapes / parameter counts.
model = get_model()
model.summary()
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding (Embedding) (None, 20, 300) 4854600 _________________________________________________________________ dropout (Dropout) (None, 20, 300) 0 _________________________________________________________________ lstm (LSTM) (None, 20, 20) 25680 _________________________________________________________________ lstm_1 (LSTM) (None, 6) 648 _________________________________________________________________ dense (Dense) (None, 6) 42 ================================================================= Total params: 4,880,970 Trainable params: 4,880,970 Non-trainable params: 0 _________________________________________________________________
# Targets are one-hot encoded, hence categorical_crossentropy.
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# NOTE(review): forcing eager execution disables graph compilation and slows
# training considerably — presumably a debugging workaround; confirm it is
# still required.
tf.config.run_functions_eagerly(True)
# Hold out 33% of the padded training data for validation during fit.
history = model.fit(padded_train,
                    train_labels,
                    validation_split=0.33,
                    epochs=5)
Epoch 1/5 335/335 [==============================] - 99s 296ms/step - loss: 1.5966 - accuracy: 0.3185 - val_loss: 1.4387 - val_accuracy: 0.4518 Epoch 2/5 335/335 [==============================] - 82s 244ms/step - loss: 1.2922 - accuracy: 0.5273 - val_loss: 1.1242 - val_accuracy: 0.5766 Epoch 3/5 335/335 [==============================] - 81s 241ms/step - loss: 0.9341 - accuracy: 0.6580 - val_loss: 0.7294 - val_accuracy: 0.7930 Epoch 4/5 335/335 [==============================] - 81s 242ms/step - loss: 0.5259 - accuracy: 0.8654 - val_loss: 0.6002 - val_accuracy: 0.8284 Epoch 5/5 335/335 [==============================] - 81s 242ms/step - loss: 0.3411 - accuracy: 0.9115 - val_loss: 0.6066 - val_accuracy: 0.8292
# Plot training vs. validation loss per epoch.
metric_to_plot = "loss"
# One x-value per completed epoch (history.epoch is 0-based).
epochs = list(range(1, max(history.epoch) + 2))
training_loss = history.history[metric_to_plot]
validation_loss = history.history["val_" + metric_to_plot]
trace1 = {
    "mode": "lines+markers",
    "name": "Training Loss",
    "type": "scatter",
    "x": epochs,
    "y": training_loss
}
trace2 = {
    "mode": "lines+markers",
    "name": "Validation Loss",
    "type": "scatter",
    "x": epochs,
    "y": validation_loss
}
# Fix: plotly's Data wrapper is deprecated; a plain list of traces is the
# supported equivalent.
data = [trace1, trace2]
layout = {
    "title": "Training - Validation Loss",
    "xaxis": {
        "title": "Number of epochs",
        "titlefont": {
            "size": 18,
            "color": "#7f7f7f"
        }
    },
    "yaxis": {
        "title": "Loss",
        "titlefont": {
            "size": 18,
            "color": "#7f7f7f"
        }
    }
}
fig = Figure(data=data, layout=layout)
fig.update_layout(hovermode="x unified")
fig.show()
# Plot training vs. validation accuracy per epoch.
metric_to_plot = "accuracy"
epochs = list(range(1, max(history.epoch) + 2))
# Fix: locals renamed from training_loss/validation_loss — these hold
# accuracies here, and the old names were misleading.
training_accuracy = history.history[metric_to_plot]
validation_accuracy = history.history["val_" + metric_to_plot]
trace1 = {
    "mode": "lines+markers",
    "name": "Training Accuracy",
    "type": "scatter",
    "x": epochs,
    "y": training_accuracy
}
trace2 = {
    "mode": "lines+markers",
    "name": "Validation Accuracy",
    "type": "scatter",
    "x": epochs,
    "y": validation_accuracy
}
# Fix: plotly's deprecated Data wrapper replaced by a plain list of traces.
data = [trace1, trace2]
layout = {
    "title": "Training - Validation Accuracy",
    "xaxis": {
        "title": "Number of epochs",
        "titlefont": {
            "size": 18,
            "color": "#7f7f7f"
        }
    },
    "yaxis": {
        "title": "Accuracy",
        "titlefont": {
            "size": 18,
            "color": "#7f7f7f"
        }
    }
}
fig = Figure(data=data, layout=layout)
fig.update_layout(hovermode="x unified")
fig.show()
# Threshold softmax outputs at 0.5 to get one-hot-style prediction rows.
# NOTE(review): if no class probability exceeds 0.5 the row is all zeros and
# the downstream argmax silently falls back to class 0 — taking argmax of the
# raw probabilities directly would avoid this bias; confirm intent.
y_pred_one_hot_encoded = (model.predict(padded_train)> 0.5).astype("int32")
y_pred_one_hot_encoded
array([[0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 1, 0],
[1, 0, 0, 0, 0, 0],
...,
[0, 0, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0]])
# Recover integer class labels from the one-hot predictions.
# Fix: `np` was never imported anywhere in this file (NameError); use the
# tensor's own .numpy() conversion instead of np.array(...).
y_pred = tf.argmax(y_pred_one_hot_encoded, axis=1).numpy()
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 on the training split.
print(classification_report(train_data['emotion_label'], y_pred))
precision recall f1-score support
0 0.76 0.90 0.83 2159
1 0.90 0.90 0.90 1937
2 0.93 0.95 0.94 5362
3 0.85 0.82 0.84 1304
4 0.97 0.92 0.95 4666
5 0.94 0.58 0.72 572
accuracy 0.90 16000
macro avg 0.89 0.85 0.86 16000
weighted avg 0.91 0.90 0.90 16000
# Evaluate on the test split with the same 0.5-threshold scheme.
y_pred_one_hot_encoded = (model.predict(padded_test)> 0.5).astype("int32")
# Fix: avoid the undefined `np` name — convert via the tensor's .numpy().
y_pred = tf.argmax(y_pred_one_hot_encoded, axis=1).numpy()
print(classification_report(test_data['emotion_label'], y_pred))
precision recall f1-score support
0 0.61 0.81 0.70 275
1 0.81 0.79 0.80 224
2 0.85 0.86 0.85 695
3 0.65 0.67 0.66 159
4 0.92 0.82 0.86 581
5 0.88 0.42 0.57 66
accuracy 0.80 2000
macro avg 0.79 0.73 0.74 2000
weighted avg 0.82 0.80 0.80 2000